#include "GenomeNTdata.h"

CGenomeNTdata::CGenomeNTdata(void)
{
    this->initialization();
}
CGenomeNTdata::CGenomeNTdata(const char* DataSetListFile)
{
    this->initialization();

    ifstream ListFile(DataSetListFile);
    char inputfastaFilename[MAX_LINE];
    while (GetNextFilenameFromListFile(ListFile, inputfastaFilename)) {
        //New a CchromosomeNTdata object and add it genome
        if (fileExist(inputfastaFilename)) {
            this->addChromosome(inputfastaFilename);
        } else {
            LOG_INFO("Info %d: Can't open %s in %s.\r", CONFIG_LOG,\
                     inputfastaFilename, get_working_directory().c_str());
        }
    }
    myStrCpy(this->refName, getBasename(DataSetListFile).c_str(), MAX_LINE);
    if (this->iNo_of_chromosome > 0) {
        LOG_INFO("Info %d: %d seqs are in %s.\r", CONFIG_LOG,\
                 this->iNo_of_chromosome, DataSetListFile);
    } else {
        LOG_INFO("Info %d: No seqs are in %s.\r", WARNING_LOG, DataSetListFile);
    }
    this->checkRefsNames();
}

CGenomeNTdata::~CGenomeNTdata(void)
{
    unsigned int i;
    for (i = 0; i < GENOME_CAPACITY; i++) {
        delete this->paChromosomes[i];
        this->paChromosomes[i] = NULL;
    }
    // this->paChromosomes is not new
}

int CGenomeNTdata::initialization(void)
{
    unsigned int i;
    this->iGenomeSize = 0;
    this->refName[0] = '\0';
    this->iNo_of_chromosome = 0;
    this->caKmer[0] = '\0';
    for (i = 0; i < GENOME_CAPACITY; i++) {
        IndexCovertTable[i] = 0;
        this->paChromosomes[i] = NULL;
    }
    return(0);
}

// delete the spaced used character string of each chromosome
int CGenomeNTdata::freeChromosomeSpace(void)
{
    for (unsigned int i = 0; i < this->iNo_of_chromosome; i++) {
        delete [] this->paChromosomes[i]->caChromosome;
        this->paChromosomes[i]->caChromosome = NULL;
    }
    return(0);
}

/* This function add one more chromosome in the genome set,
 * return the size of newly add in chromosome */
unsigned int CGenomeNTdata::addChromosome(const char* chromosomeFileName, bool bFastaFormat)
{
    // Concatenate the chromosome file name as the name of the reference genome (No ext file name)
    string newRefName = string(refName).append(getBasename(chromosomeFileName));
    strcpy(refName, newRefName.c_str());

    if (fileExist(chromosomeFileName)) {
        CchromosomeNTdata* pChr = NULL;
        pChr = new CchromosomeNTdata(chromosomeFileName, bFastaFormat);
        if (pChr != NULL) {
            this->paChromosomes[iNo_of_chromosome] = pChr;
            if (iNo_of_chromosome < GENOME_CAPACITY - 1) {
                if (iNo_of_chromosome == 0) {
                    this->IndexCovertTable[iNo_of_chromosome] = pChr->iChromosome_size;
                    this->iGenomeSize = pChr->iChromosome_size;
                } else { /*iNo_of_chromosome > 0 */
                    this->IndexCovertTable[iNo_of_chromosome] =
                        this->IndexCovertTable[iNo_of_chromosome-1] + pChr->iChromosome_size;
                    this->iGenomeSize += pChr->iChromosome_size;
                }
                cout << "Reference " << chromosomeFileName << " has " << pChr->iChromosome_size;
                cout << " bases.\n" << endl;
            } else {
                cout << "\n Add too many chromosomes as the reference." << endl;
                cout << "Increase the constant GENOME_CAPACITY in the code." << endl;
            }
            this->iNo_of_chromosome++;
            return(pChr->iChromosome_size);
        } else {
            cout << chromosomeFileName << " is in a wrong format." << endl;
            return(0);
        }
    } else {
        cout << chromosomeFileName << " are not found in ";
        cout << get_working_directory() << endl;
        return (0); // can not open file
    }
}

char* CGenomeNTdata::genomeLocusID2Kmer(unsigned int uiKmer_Length, unsigned int genomeLocusID)
{
    this->caKmer[0] = '\0'; //initialize
    if (this->paChromosomes[0] == NULL || this->paChromosomes[0]->caChromosome == NULL) {
        cout << "Warning!! No chromosome or has been free" << endl;
        return(this->caKmer);
    }

    if (genomeLocusID == BAD_GENOME_INDEX) {
        cout << "Wrong Genome Locus" << endl;//Exception, wrong genomeLocusID
    } else {
        unsigned int chrID = this->genomeIndex2chrID(genomeLocusID);
        if (chrID >= this->iNo_of_chromosome) {
            cout << "error in the chrID" << endl;
        } else {
            unsigned int chrLocusID;
            if (chrID == 0) {
                chrLocusID = genomeLocusID;
            } else {
                chrLocusID = genomeLocusID -  this->IndexCovertTable[chrID-1];
            }
            if (chrLocusID < this->paChromosomes[chrID]->iChromosome_size) {
                strncpy(this->caKmer, &(this->paChromosomes[chrID]->caChromosome[chrLocusID]), uiKmer_Length);
            } else {
                cout << "Error chrLocusID " << chrID << ',' << chrLocusID << endl;
                ERR;
            }
            this->caKmer[uiKmer_Length] = '\0';
        }
    }
    return(this->caKmer);
}

/*  This function will covert the position described by (chromosome ID and chromosome locus ID),
 *	a number pair to genome locus index recorded in the table list.
 */
unsigned int CGenomeNTdata::chrIndex2genomelocusID(unsigned int iChrID, unsigned int iChrLocusID)
{
    if (iChrID > 0) {
        unsigned int returnValue = this->IndexCovertTable[iChrID-1] + iChrLocusID;
        return(returnValue);
    } else
        return(iChrLocusID);
}

unsigned int CGenomeNTdata::genomeIndex2chrID(unsigned int igenomeLocusID)
{
    unsigned int i;

    for (i = 0; i < this->iNo_of_chromosome; i++) {
        if (igenomeLocusID < this->IndexCovertTable[i]) {
            return(i);
        }
    }
    cout << "Unknown Chromosome" << endl;
    return(this->iNo_of_chromosome);
}

/* This function will covert to genome locus index recorded in the table list
 * to the locus index recorded in the table list
 */
unsigned int CGenomeNTdata::genomeLocusID2chrIndex(unsigned int igenomeLocusID)
{
    unsigned int i;
    for (i = 0; i < this->iNo_of_chromosome; i++) {
        if (igenomeLocusID < this->IndexCovertTable[i]) {
            break;
        }
    }

    if (i == 0) {
        return(igenomeLocusID);
    } else if (i < this->iNo_of_chromosome) {
        return(igenomeLocusID - this->IndexCovertTable[i-1]);
    } else {
        cout << "Wrong genome locus" << igenomeLocusID << endl;
    }
    return(igenomeLocusID);
}

void CGenomeNTdata::checkRefsNames(void)
{
    set<string> refNames;
    for (unsigned int i = 0; i < this->iNo_of_chromosome; i++) {
        vector<CGene>* chrRefNs = &(this->paChromosomes[i]->geneVec.table);
        if (chrRefNs->size() > 0) {
            vector<CGene>::iterator it = chrRefNs->begin();
            for (; it != chrRefNs->end(); it++) {
                if (!refNames.insert(it->name).second) {
                    LOG_INFO("Info %d: ref %s in %s are duplicated.\n",\
                             WARNING_LOG, it->name.c_str(), this->paChromosomes[i]->caInputFileName);
                }
            }
        }
    }
}

// This procedure returns a large object by copy.
vector<CGene> CGenomeNTdata::getRefNamesLengths(void)
{
    vector<CGene> refNamesLengthes;
    for (unsigned int i = 0; i < this->iNo_of_chromosome; i++) {
        CchromosomeNTdata* chr = this->paChromosomes[i];
        vector<CGene>* chrGenes = &(chr->geneVec.table);
        if ((int)(chrGenes->size()) > 0) {
            int originalSize = (int)refNamesLengthes.size();
            copy(chrGenes->begin(), chrGenes->end(), std::back_inserter(refNamesLengthes));
            // change the meaning CGene.startIndex to length of the gene
            vector<CGene>::iterator it = refNamesLengthes.begin() + originalSize;
            for (; (it + 1) != refNamesLengthes.end(); it++) {
                it->startIndex = ((it + 1)->startIndex - it->startIndex);
            }
            it->startIndex = chr->iChromosome_size - it->startIndex;
        } else {
            refNamesLengthes.push_back(CGene(getBasename(chr->caInputFileName), chr->iChromosome_size));
        }
    }
    return(refNamesLengthes);
}

unsigned int BruteForceSearch(CGenomeNTdata& genome, char* Kmer)
{
    unsigned int kmer_length = (unsigned int) strlen(Kmer);
    unsigned int chrID = 0, chromsomeIndex = 0; // Best alignments chrID and index.
    unsigned int i, j, k;
    unsigned MinDiff = kmer_length;
    bool reverseIsBetter = false;

    for (i = 0; i < genome.iNo_of_chromosome; i++) {
        for (j = 0; j < genome.paChromosomes[i]->iChromosome_size - kmer_length; j++) {
            unsigned int diff = 0;
            for (k = 0; k < kmer_length; k++) {
                if (genome.paChromosomes[i]->caChromosome[j + k] != Kmer[k])
                    diff++;
            }
            if (diff < MinDiff) {
                MinDiff = diff;
                chrID = i;
                chromsomeIndex = j;
                reverseIsBetter = false;
            }
            //Also check the Reveres direction
            for (diff = 0, k = 0; k < kmer_length; k++) {
                char nt = genome.paChromosomes[i]->caChromosome[j+k];
                nt = complimentBase(nt);//Get the complement base
                if (nt != Kmer[kmer_length - 1 - k])
                    diff++;
            }
            if (diff < MinDiff) {
                MinDiff = diff;
                chrID = i;
                chromsomeIndex = j;
                reverseIsBetter = true;
            }
        }
    }
    if (MinDiff <= 3) {
        cout << "Best alignment is located at ";
        unsigned int genomeIndex = genome.chrIndex2genomelocusID(chrID, chromsomeIndex);
        cout << genomeIndex << ':' << genome.genomeLocusID2Kmer(kmer_length, genomeIndex);
        if (reverseIsBetter)
            cout << "reverse is better" << endl;
    }
    return(MinDiff);
}


